{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Merge multiple COCO annotation JSON files into one file."
      ],
      "metadata": {
        "id": "vAan5iCyEQp5"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "Given multiple COCO annotated JSON files, your goal is to merge them into one COCO annotated JSON file.\n",
        "\n",
        "A merged COCO annotated JSON file is required where all the data is in one place and it becomes easy to split it into a training and validation JSON file according to the percentage ratio. In case you already have a validated COCO annotated JSON file, then this notebook can be used to merge multiple files into one training COCO annotated JSON file."
      ],
      "metadata": {
        "id": "OXNMINymEVvW"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Import necessary libraries\n",
        "import tqdm\n",
        "import json\n",
        "import glob"
      ],
      "metadata": {
        "id": "DWc_xka7ix0I"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def merge_jsons(list_of_jsons):\n",
        "  \"\"\"\n",
        "  Merges a list of JSON files into a single JSON file.\n",
        "\n",
        "  Args:\n",
        "    list_of_jsons: A list of JSON files to be merged.\n",
        "\n",
        "  Returns:\n",
        "    A single JSON file containing the merged data.\n",
        "  \"\"\"\n",
        "\n",
        "  num = 1\n",
        "  image_id = 0\n",
        "  images_list = []\n",
        "  categories_list = []\n",
        "  annotations_list = []\n",
        "  labels_dict = {}\n",
        "  mapping_images = {}\n",
        "  mapping_categories = {}\n",
        "\n",
        "\n",
        "  for i,json_file_path in tqdm.tqdm(enumerate(list_of_jsons)):\n",
        "    # read JSON file\n",
        "    with open(json_file_path) as json_file:\n",
        "      read_json = json.load(json_file)\n",
        "\n",
        "    if len(read_json['images'][0]) != 0:\n",
        "      list_of_dic = []\n",
        "      list_of_dic_cat = []\n",
        "\n",
        "\n",
        "      # process images dictionary\n",
        "      for image in read_json['images']:\n",
        "        images_dict = {}\n",
        "        list_of_dic.append((image['id'], image_id))\n",
        "        images_dict['file_name'] = image['file_name']\n",
        "        images_dict['id'] = image_id\n",
        "        image_id += 1\n",
        "        images_dict['width'] = image['width']\n",
        "        images_dict['height'] = image['height']\n",
        "        images_list.append(images_dict)\n",
        "      mapping_images['file_{}'.format(i)] = dict(list_of_dic)\n",
        "\n",
        "\n",
        "      # process categories dictionary\n",
        "      for category in read_json['categories']:\n",
        "        list_of_dic_cat.append((category['id'], category['name']))\n",
        "        categories_dict = {}\n",
        "        if category['name'] not in labels_dict.keys():\n",
        "          if len(labels_dict.keys()) == 0:\n",
        "            labels_dict[read_json['categories'][0]['name']] = 1\n",
        "          else:\n",
        "            labels_dict[category['name']] = max(labels_dict.values()) + 1\n",
        "          categories_dict['supercategory'] = category['supercategory']\n",
        "          categories_dict['id'] = labels_dict[category['name']]\n",
        "          categories_dict['name'] = category['name']\n",
        "          categories_list.append(categories_dict)\n",
        "        else:\n",
        "          pass\n",
        "      mapping_categories['file_{}'.format(i)] = dict(list_of_dic_cat)\n",
        "\n",
        "\n",
        "      # process annotations dictionary\n",
        "      for annotation in read_json['annotations']:\n",
        "        annotations_dict = {}\n",
        "        annotations_dict['segmentation'] = annotation['segmentation']\n",
        "        annotations_dict['area'] = annotation['area']\n",
        "        annotations_dict['bbox'] = annotation['bbox']\n",
        "        annotations_dict['image_id'] = mapping_images['file_{}'.format(i)][annotation['image_id']]\n",
        "        annotations_dict['category_id'] = labels_dict[mapping_categories['file_{}'.format(i)][annotation['category_id']]]\n",
        "        annotations_dict['id'] = num\n",
        "        num +=1\n",
        "        annotations_dict['iscrowd'] = 0\n",
        "        annotations_list.append(annotations_dict)\n",
        "\n",
        "\n",
        "  final_json = {\n",
        "      'images':images_list,\n",
        "      'categories':categories_list,\n",
        "      'annotations':annotations_list\n",
        "      }\n",
        "  return final_json"
      ],
      "metadata": {
        "id": "7kQVyWxYqVdj"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "files = glob.glob('/mydrive/sherman/**/*.json', recursive=True)\n",
        "files"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "g-617H5ONlwk",
        "outputId": "2bb90235-f1cb-49b8-d8cc-1e1d54f0a9f8"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "['/mydrive/sherman/annotation_3.json',\n",
              " '/mydrive/sherman/annotation_2.json',\n",
              " '/mydrive/sherman/annotation_4.json',\n",
              " '/mydrive/sherman/annotation_1.json']"
            ]
          },
          "metadata": {},
          "execution_count": 20
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "data = merge_jsons(files)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "XWTy85rMEka3",
        "outputId": "85567bf2-27e0-4152-8ad1-6539b551a9fb"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "4it [00:00, 10.26it/s]\n"
          ]
        }
      ]
    }
  ]
}